scRNAseq: Bioconductor

DropletUtils: Bioconductor

scater: Bioconductor, GitHub, Paper

ggbeeswarm: CRAN, GitHub

Demo Dataset: BachMammaryData from Differentiation dynamics of mammary epithelial cells revealed by single-cell RNA sequencing Nat Commun. 2017;8(1):2128.

License: GPL-3.0

Start R & prepare environment

Start R

cd /ngs/scRNA-seq-Analysis-Demo

R

Load R libraries

suppressPackageStartupMessages({
    library(scRNAseq)
    library(scater)
    library(ggbeeswarm) # geom_quasirandom
})

Set up colour palettes

c30 <- c("#1C86EE", #1 dodgerblue2
         "#FF0000", #2 red1
         "#008B00", #3 green4
         "#FF7F00", #4 DarkOrange1
         "#00FF00", #5 green1
         "#A020F0", #6 purple
         "#0000FF", #7 blue1
         "#FF1493", #8 DeepPink1
         "#8B4500", #9 DarkOrange4
         "#000000", #10 black
         "#FFD700", #11 gold1
         "#00CED1", #12 DarkTurquoise
         "#68228B", #13 DarkOrchid4
         "#FF83FA", #14 orchid1
         "#B3B3B3", #15 gray70
         "#B03060", #16 maroon
         "#7CCD7C", #17 PaleGreen3
         "#333333", #18 gray20
         "#D8BFD8", #19 thistle
         "#FFC125", #20 goldenrod1
         "#EEE685", #21 khaki2
         "#7EC0EE", #22 SkyBlue2
         "#36648B", #23 SteelBlue4
         "#54FF9F", #24 SeaGreen1
         "#8B8B00", #25 yellow4
         "#CDCD00", #26 yellow3
         "#F08080", #27 LightCoral
         "#A52A2A", #28 brown
         "#00008B", #29 blue4
         "#CD2626"  #30 firebrick3
)

pie(rep(1,30), col = c30, radius = 1)

Choose sample & cluster colours

# Choosing colours for samples, n = 8
sample_col <- c30[c(1:8)]

# Choosing colours for samples, n = 15
cluster_col <- c30[c(1:15)]

Load scRNA-seq data

Import public scRNA-seq data

In this example, we will use the example data set from the scRNAseq Bioconductor package. It contains expression matrices for several public scRNA-seq datasets in the form of SingleCellExperiment objects. The BachMammaryData function will download and import the mouse mammary gland single-cell RNA-seq data obtained with the 10x Genomics Chromium platform from Bach et al. (2017). The object contains 25,806 barcodes, cell annotations that includes the sample ID and condition, and the gene annotations that includes the Ensembl gene ID and gene symbol.

sce <- BachMammaryData()
sce
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(1): counts
## rownames: NULL
## rowData names(2): Ensembl Symbol
## colnames: NULL
## colData names(3): Barcode Sample Condition
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
## [1] "Number of genes: 27998"
paste0("Number of cells: ", ncol(sce))
## [1] "Number of cells: 25806"
# Cell info
str(colData(sce))
## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   ..@ rownames       : NULL
##   ..@ nrows          : int 25806
##   ..@ listData       :List of 3
##   .. ..$ Barcode  : chr [1:25806] "AAACCTGAGGCCATAG-1" "AAACCTGCATCGGGTC-1" "AAACGGGTCAAACGGG-1" "AAAGATGAGATAGCAT-1" ...
##   .. ..$ Sample   : chr [1:25806] "NP_1" "NP_1" "NP_1" "NP_1" ...
##   .. ..$ Condition: Named chr [1:25806] "Nulliparous" "Nulliparous" "Nulliparous" "Nulliparous" ...
##   .. .. ..- attr(*, "names")= chr [1:25806] "NP" "NP" "NP" "NP" ...
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()
# Gene info
str(rowData(sce))
## Formal class 'DFrame' [package "S4Vectors"] with 6 slots
##   ..@ rownames       : NULL
##   ..@ nrows          : int 27998
##   ..@ listData       :List of 2
##   .. ..$ Ensembl: chr [1:27998] "ENSMUSG00000051951" "ENSMUSG00000089699" "ENSMUSG00000102343" "ENSMUSG00000025900" ...
##   .. ..$ Symbol : chr [1:27998] "Xkr4" "Gm1992" "Gm37381" "Rp1" ...
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()

Import Cell Ranger data

Note: Skip if using the BachMammaryData dataset.

# Define sample ID
sample_id <- c("Sample1", "Sample2", "Sample3", "Sample4")

To import scRNA-seq data generated by Cell Ranger, we can using the read10xCounts function from the DropletUtils package, which will produce a SingleCellExperiment object containing count data for each gene (row) and cell (column) across all samples.

Import 10X data

A. From matrix files

Edit the script below to point cr_mat_path to the directory containing CellRanger output files: barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz.

# Define data location
cr_mat_path <- "path-to-cellranger-output-folder" # a folder

# Check if these files exist: barcodes.tsv.gz, features.tsv.gz, matrix.mtx.gz
check_matrix_input <- function(cr_mat_path) {
    error <- FALSE
    if(! file.exists(paste0(cr_mat_path,"/barcodes.tsv.gz"))) {
            error <- TRUE
            print("'barcodes.tsv.gz' not found")
    }

    if(! file.exists(paste0(cr_mat_path,"/features.tsv.gz"))) {
            error <- TRUE
            print("'features.tsv.gz' not found")
    }

    if(! file.exists(paste0(cr_mat_path,"/matrix.mtx.gz"))) {
            error <- TRUE
            print("'matrix.mtx.gz' not found")
    } 

    if(isTRUE(error)) {
            stop("Stopped!")
    } else {
            print("All files found.")
    }       
}

check_matrix_input(cr_mat_path)

# Import data
sce <- DropletUtils::read10xCounts(cr_mat_path, sample.names = sample_id)
sce

# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))

B. From HDF5 files

Edit the script below to point cr_h5_path to the path of the filtered_feature_bc_matrix.h5 file.

# Define data location
cr_h5_file <- "path-to-cellranger-output-folder/filtered_feature_bc_matrix.h5" # a h5 file

# Check if this files exist
file.exists(cr_h5_file)

# Import data
sce <- DropletUtils::read10xCounts(cr_h5_file, sample.names = sample_id)
sce

# Print the number of genes and cells in the object
paste0("Number of genes: ", nrow(sce))
paste0("Number of cells: ", ncol(sce))

Add cell and gene annotations

Note: Skip if using the BachMammaryData dataset.

colnames(sce) <- colData(sce)$Barcode
rownames(sce) <- rowData(sce)$Symbol

# Cell info
str(colData(sce))

# Gene info
str(rowData(sce))

Add average expression

We use the calculateAverage function to average counts per feature after normalizing with size factors

# 'calcAverage' is deprecated.
ave <- calculateAverage(sce)
rowData(sce)$ave_counts <- ave

Add log2 CPM to sce

In addition to the count data in assays, we will also add the log2 normalised count-per-million (CPM) values using exprs.

# Assay info
str(assays(sce))
## Formal class 'SimpleList' [package "S4Vectors"] with 4 slots
##   ..@ listData       :List of 1
##   .. ..$ counts:Formal class 'dgCMatrix' [package "Matrix"] with 6 slots
##   .. .. .. ..@ i       : int [1:61219942] 13 20 24 29 37 38 39 49 51 60 ...
##   .. .. .. ..@ p       : int [1:25807] 0 3729 7030 10257 14028 17655 19573 22242 25078 27312 ...
##   .. .. .. ..@ Dim     : int [1:2] 27998 25806
##   .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. ..$ : NULL
##   .. .. .. .. ..$ : NULL
##   .. .. .. ..@ x       : num [1:61219942] 1 2 6 1 1 4 1 1 1 1 ...
##   .. .. .. ..@ factors : list()
##   ..@ elementType    : chr "ANY"
##   ..@ elementMetadata: NULL
##   ..@ metadata       : list()
# Accessing the assay data
dim(assay(sce, "counts"))
## [1] 27998 25806

Note: The use_size_factors parameter in calculateCPM is deprecated, use size_factors instead. The default of the size_factors parameter is NULL, and this will use library sizes directly.

Note: A pseudocount of 1 is added to avoid undefined values after the log-transformation.

# More about lib.sizes calculation in calculateCPM()
# x is a numeric matrix of counts
x <- assay(sce, "counts")
size.factors <- colSums(x)/mean(colSums(x))     # same as scater::librarySizeFactors(x)
lib.sizes <- colSums(x) / 1e6                   # count-per-million
lib.sizes <- size.factors / mean(size.factors) * mean(lib.sizes) # normalisation

Add log2 CPM.

pseudocount = 1
exprs(sce) <- log2(calculateCPM(sce) + pseudocount) # See note

Print object info.

sce
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(2): counts logcounts
## rownames: NULL
## rowData names(3): Ensembl Symbol ave_counts
## colnames: NULL
## colData names(3): Barcode Sample Condition
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):

Generate QC metrics

Define mitochondrial genes

We use grep to perform pattern matching to look for genes that are from the mitochondrial genome (chrM). The regular expression "^mt-|^MT-" will work for both human (MT-) and mouse (mt-) mitochondrial genomes. The ^ anchor is to ensure the pattern is matched to the beginning of the gene symbol.

# The subset feature can be supplied as character vector of feature names, a logical vector,
# or a numeric vector of indices
#is.mito <- grepl("^mt-|^MT-", rowData(sce)$Symbol) # a logical vector
is.mito <- grep("^mt-|^MT-", rowData(sce)$Symbol)   # numeric vector of indices

# Print matched genes
rowData(sce)$Symbol[is.mito]
##  [1] "mt-Nd1"  "mt-Nd2"  "mt-Co1"  "mt-Co2"  "mt-Atp8" "mt-Atp6" "mt-Co3"  "mt-Nd3" 
##  [9] "mt-Nd4l" "mt-Nd4"  "mt-Nd5"  "mt-Nd6"  "mt-Cytb"

Add QC metrics

With an older version of scater, we would use the calculateQCMetrics function to add QC metrics, but it is now deprecated. We will instead use addPerCellQC and addPerFeatureQC and add additional metrics to colData and rowData instead.

Add per-cell QC

# "sum" - sum of counts for the cell (library size)
# "detected" - number of genes for the cell that have counts above the detection limit (default 0)
sce <- addPerCellQC(sce, list(MT = is.mito))

# Add additional stats to per-cell QC
colData(sce)$log10_sum <- log10(colData(sce)$sum + pseudocount)
colData(sce)$log10_detected <- log10(colData(sce)$detected + pseudocount)
colData(sce)$log10_genes_per_umi <- colData(sce)$log10_sum / colData(sce)$log10_detected

Add per-feature QC

# "mean" - mean counts for each gene across all cells
# "detected" - percentage of cells with non-zero counts for each gene
sce <- addPerFeatureQC(sce)

# Add additional stats to per-feature QC
rowData(sce)$is_MT <- FALSE
rowData(sce)$is_MT[is.mito] <- TRUE
rowData(sce)$log10_mean <- log10(rowData(sce)$mean + pseudocount)
rowData(sce)$n_cells <- nexprs(sce, byrow = TRUE)                 # number of expressing cells
rowData(sce)$fq_n_cells <- rowData(sce)$n_cells / ncol(sce) * 100 # percentage of expressing cells
rowData(sce)$pct_dropout <- 100 - rowData(sce)$detected # percentage of cells with zero counts
rowData(sce)$total_counts <- rowData(sce)$mean * ncol(sce)
rowData(sce)$log10_total_counts <- log10(rowData(sce)$total_counts + pseudocount)
sce
## class: SingleCellExperiment 
## dim: 27998 25806 
## metadata(0):
## assays(2): counts logcounts
## rownames: NULL
## rowData names(12): Ensembl Symbol ... total_counts log10_total_counts
## colnames: NULL
## colData names(16): Barcode Sample ... log10_detected log10_genes_per_umi
## reducedDimNames(0):
## spikeNames(0):
## altExpNames(0):
# Print column names from the gene and cell data
names(colData(sce))
##  [1] "Barcode"             "Sample"              "Condition"          
##  [4] "sum"                 "detected"            "percent_top_50"     
##  [7] "percent_top_100"     "percent_top_200"     "percent_top_500"    
## [10] "subsets_MT_sum"      "subsets_MT_detected" "subsets_MT_percent" 
## [13] "total"               "log10_sum"           "log10_detected"     
## [16] "log10_genes_per_umi"
names(rowData(sce))
##  [1] "Ensembl"            "Symbol"             "ave_counts"         "mean"              
##  [5] "detected"           "is_MT"              "log10_mean"         "n_cells"           
##  [9] "fq_n_cells"         "pct_dropout"        "total_counts"       "log10_total_counts"

Identify low-quality cells

In most cases, We can assume most of the cells are high quality and use the median absolute deviation (MAD) from the median approach to identify cells that are outliers, presumably representing low-quality cells. The threshold used here to determined if a cell is an outlier is if it is more than 3 MADs from the median, under a normal distribution this cutoff will retain 99% of cells. The log-transformed values will be applied to the input values (with log = TRUE) as this improves resolution at small values for distribution exhibit a heavy right tail, and avoid inflation of the MAD that might compromise outlier detection on the left tail.

Note: One should be aware of factors that could affect the distribution, for example certain cells can have a high metabolic rate and thus higher mitochondrial gene expression, or certain cells can express very few genes. In such cases, one might need to use fixed thresholds to filter low-quality cells.

qc.lib <- isOutlier(colData(sce)$sum, nmads = 3, log = TRUE, type = "lower")
qc.expr <- isOutlier(colData(sce)$detected, nmads = 3, log = TRUE, type = "lower")
qc.mito <- isOutlier(colData(sce)$subsets_MT_percent, nmads = 3, type = "higher")

View the filter thresholds and determine if they are appropriate.

attr(qc.lib, "thresholds")  # values lower than the "lower" threshold would be filtered
##    lower   higher 
## 924.5199      Inf
attr(qc.expr, "thresholds") # values lower than the "lower" threshold would be filtered
##    lower   higher 
## 582.1861      Inf
attr(qc.mito, "thresholds") # values higher than the "higher" threshold would be filtered
##    lower   higher 
##     -Inf 1.632992

Note: The qc.mito threshold is very low, i.e. it will remove cells with mitochondrial proportions greater than ~1.63%. So in this case, we will set a fixed threshold of 10%.

qc.mito.threshold = 10 # set at 10%
qc.mito <- colData(sce)$subsets_MT_percent > qc.mito.threshold
attr(qc.mito, "thresholds")["lower"] <- -Inf
attr(qc.mito, "thresholds")["higher"] <- qc.mito.threshold

attr(qc.mito, "thresholds")
##  lower higher 
##   -Inf     10

Summarize the number of cells removed for each reason.

discard <- qc.lib | qc.expr | qc.mito
colData(sce)$discard <- discard # Add a column in colData to store QC filter result
DataFrame(LibSize = sum(qc.lib), ExprGene = sum(qc.expr), MitoProp = sum(qc.mito), 
      Total = sum(discard))
## DataFrame with 1 row and 4 columns
##     LibSize  ExprGene  MitoProp     Total
##   <integer> <integer> <integer> <integer>
## 1         0         0         0         0

Assess QC metrics on cells

Now we are ready to inspect the distributions of various metrics and the thresholds chosen earlier. In an ideal case, we will see these metrics follow normal distributions and thus would justify the 3 MAD thresholds used in outlier detection. Afer assessing the plots, we can decide if the thresholds need adjustment to account for specific biological states or subset of cells, etc.

Cell counts

The cell counts are determined by the number of unique cellular barcodes detected. Ideally, the number of unique cellular barcodes will correpsond to the number of cells loaded. However the capture rates or capture efficiency of cells will affect the eventual cell counts. Accurate measure the input cell concentration is also important in determining the cell capture efficiency. Lastly, it is also possible to detect cell numbers that are much higher than what we loaded due to the experimental procedure. For example, there is a chance of obtaining only a barcoded bead in the emulsion droplet (GEM) and no actual cell with the 10X protocol.

Note: What were the expected cell counts in samples?

table(colData(sce)$Sample)
## 
##  G_1  G_2  L_1  L_2 NP_1 NP_2 PI_1 PI_2 
## 2915 3106 5906 3697 2249 2127 1500 4306
ggplot(as.data.frame(colData(sce)), aes(Sample, fill = Sample)) + geom_bar(color = "black") + 
    geom_text(stat = "count", aes(label = ..count..), vjust = -0.5, size = 5) +
    scale_fill_manual(values = sample_col) + theme_classic(base_size = 12) + 
    theme(legend.position = "none") + ylab("Counts") + ggtitle("Cell counts")

Library size

Next, we will consider the total number of RNA molecules detected per cell. The unique molecular identifier (UMI) counts should generally be above 500 (ref). Wells with few transcripts are likely to have been broken or failed to capture a cell, and should thus be removed. If UMI counts are between 500-1000 counts, it is usable but the cells probably should have been sequenced more deeply (ref).

View library size distributions with quantile.

Note: Are there any cells with low total UMI counts?

quantile(colData(sce)$sum, seq(0, 1, 0.1))  # 0% - 100% percentile
##       0%      10%      20%      30%      40%      50%      60%      70%      80%      90% 
##   1724.0   2880.0   3704.0   4576.5   5359.0   6285.5   7434.0   8961.5  11088.0  15572.0 
##     100% 
## 116555.0
quantile(colData(sce)$sum, seq(0.9, 1, 0.1))    # 90% - 100% percentile (high read-depth)
##    90%   100% 
##  15572 116555

Visualise library size distributions with ggplot.

logbreak = scales::trans_breaks("log10", function(x) 10^x)
loglab = scales::trans_format("log10", scales::math_format(10^.x))

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(sum, color = Sample, fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
    scale_x_log10(breaks = logbreak, labels = loglab) +
    geom_vline(xintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("log10 Total UMI counts") + 
    ylab("Frequency") + ggtitle("Histogram of Library Size per Cell")

# violin plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, sum, colour = discard)) +
    geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
    geom_hline(yintercept = attr(qc.lib, "thresholds")[1], linetype = 2) + # show cutoff
    scale_color_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
    theme(legend.position = "right") + ylab("log10 Total UMI counts") + 
    ggtitle("Violin plot of Library Size perl Cell")

multiplot(p1, p2)

Expressed features

Aside from having sufficient sequencing depth for each sample, we also expected to see reads distributed across the transcriptome. When visualised the expressed features (i.e. genes) in all the cells as a histogram or density plot, the plot should contain a single large peak for high quality data.

View expressed features distributions with quantile.

quantile(colData(sce)$detected, seq(0, 1, 0.1))
##   0%  10%  20%  30%  40%  50%  60%  70%  80%  90% 100% 
##  604 1223 1473 1731 1948 2167 2407 2710 3122 3786 8795

Visualise expressed features distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(detected, color = Sample, fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) + 
    scale_x_log10(breaks = logbreak, labels = loglab) +
    geom_vline(xintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("log10 Total expressed genes") + 
    ylab("Frequency") + ggtitle("Histogram of Expressed Features per Cell")

# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, detected, colour = discard)) +
    geom_violin(width = 1) + scale_y_log10(breaks = logbreak, labels = loglab) +
    geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
    geom_hline(yintercept = attr(qc.expr, "thresholds")[1], linetype = 2) + # show cutoff
    scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
    theme(legend.position = "right") + ylab("log10 Total expressed genes") + 
    ggtitle("Violin plot of Expressed Features per Cell")

multiplot(p1, p2)

Complexity of RNA species

The UMI count per cell and the number of genes detected per cell are often evaluated together. These two indices are usually strongly related, i.e. the higher UMI count for a cell, the more genes are detected as well. Cells that have a less complex RNA species (low number of genes detected per UMI), such as red blood cells, can often be detected by this metric. Generally, we expect the complexity score to be above 0.80 ref.

View complexity distributions with quantile.

quantile(colData(sce)$log10_genes_per_umi, seq(0, 1, 0.1))
##       0%      10%      20%      30%      40%      50%      60%      70%      80%      90% 
## 1.065505 1.114984 1.122271 1.128256 1.133894 1.139502 1.145985 1.153267 1.162562 1.176168 
##     100% 
## 1.350996

Visualise complexity distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(log10_genes_per_umi, color = Sample,
                                              fill = Sample)) +
        geom_histogram(position = "identity", alpha = 0.4, bins = 50) + 
    geom_vline(xintercept = 0.8, linetype = 2) +
        scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
        theme_classic(base_size = 12) + xlab("log10 Gene per UMI") + ylab("Frequency") +
        ggtitle("Histogram of Complexity of Gene Expression")

# scatter plot
p2 <- ggplot(as.data.frame(colData(sce)), aes(log10_sum, log10_detected, color = Sample)) + 
    geom_point(size = 0.6, alpha = 0.3) + facet_wrap(~ as.data.frame(colData(sce))$Sample) + 
    scale_color_manual(values = sample_col) + theme_classic(base_size = 12) +
    guides(color = guide_legend("Sample", override.aes = list(size = 3, alpha = 1))) +
    xlab("log10 Total UMI counts") + ylab("log10 Total expressed genes") + 
    ggtitle("UMIs vs. Expressed Genes")

multiplot(p1, p2)

Mitochondrial contamination

The mitochondrial proportions in cells is an useful indicator of cell quality. High proportion of counts assigned to mitochondrial genes is an indication of damaged, dying and dead cells, whereby cytoplasmic mRNA has leaked out through a broken membrane, hence only the mRNA located in the mitochondria is preserved and being sequenced.

View distributions with quantile.

Note: Are there any cells with high expression of mitochondrial genes (>20% of total counts in a cell)?

quantile(colData(sce)$subsets_MT_percent, seq(0, 1, 0.1))
##        0%       10%       20%       30%       40%       50%       60%       70%       80% 
## 0.0000000 0.4207468 0.5272408 0.6069388 0.6817726 0.7510203 0.8267616 0.9212001 1.0394179 
##       90%      100% 
## 1.2505218 8.2174054

Visualise distributions with ggplot.

# histogram
p1 <- ggplot(as.data.frame(colData(sce)), aes(x = subsets_MT_percent, color = Sample, 
                           fill = Sample)) +
    geom_histogram(position = "identity", alpha = 0.4, bins = 50) +
    geom_vline(xintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
    scale_fill_manual(values = sample_col) + scale_color_manual(values = sample_col) +
    theme_classic(base_size = 12) + xlab("% counts from mitochondrial genes") + 
    ylab("Frequency") + ggtitle("Histogram of Mitochondrial Proportions per Cell")

# violin
p2 <- ggplot(as.data.frame(colData(sce)), aes(Sample, subsets_MT_percent, colour = discard)) +
        geom_violin(width = 1) + geom_quasirandom(size = 0.2, alpha = 0.2, width = 0.5) +
        geom_hline(yintercept = attr(qc.mito, "thresholds")[2], linetype = 2) + # show cutoff
        scale_colour_manual(values = c("blue", "red")) + theme_classic(base_size = 12) +
        theme(legend.position = "right") + ylab("% counts from mitochondrial genes") +
        ggtitle("Violin plot of Mitochondrial Proportions per Cell")

multiplot(p1, p2)

Session information

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-conda_cos6-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/ihsuan/miniconda3/envs/jupyterlab/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C               LC_TIME=en_GB.UTF-8       
##  [4] LC_COLLATE=en_GB.UTF-8     LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods  
## [9] base     
## 
## other attached packages:
##  [1] ggbeeswarm_0.6.0            scater_1.14.6               ggplot2_3.3.2              
##  [4] scRNAseq_2.0.2              SingleCellExperiment_1.8.0  SummarizedExperiment_1.16.1
##  [7] DelayedArray_0.12.3         BiocParallel_1.20.1         matrixStats_0.56.0         
## [10] Biobase_2.46.0              GenomicRanges_1.38.0        GenomeInfoDb_1.22.1        
## [13] IRanges_2.20.2              S4Vectors_0.24.4            BiocGenerics_0.32.0        
## [16] knitr_1.29                 
## 
## loaded via a namespace (and not attached):
##  [1] viridis_0.5.1                 httr_1.4.2                   
##  [3] BiocSingular_1.2.2            viridisLite_0.3.0            
##  [5] bit64_4.0.5                   AnnotationHub_2.18.0         
##  [7] DelayedMatrixStats_1.8.0      shiny_1.5.0                  
##  [9] assertthat_0.2.1              interactiveDisplayBase_1.24.0
## [11] BiocManager_1.30.10           BiocFileCache_1.10.2         
## [13] blob_1.2.1                    vipor_0.4.5                  
## [15] GenomeInfoDbData_1.2.2        yaml_2.2.1                   
## [17] BiocVersion_3.10.1            pillar_1.4.6                 
## [19] RSQLite_2.2.0                 lattice_0.20-41              
## [21] glue_1.4.2                    digest_0.6.25                
## [23] promises_1.1.1                XVector_0.26.0               
## [25] colorspace_1.4-1              cowplot_1.0.0                
## [27] htmltools_0.5.0               httpuv_1.5.4                 
## [29] Matrix_1.2-18                 pkgconfig_2.0.3              
## [31] zlibbioc_1.32.0               purrr_0.3.4                  
## [33] xtable_1.8-4                  scales_1.1.1                 
## [35] later_1.1.0.1                 tibble_3.0.3                 
## [37] farver_2.0.3                  generics_0.0.2               
## [39] ellipsis_0.3.1                withr_2.2.0                  
## [41] magrittr_1.5                  crayon_1.3.4                 
## [43] mime_0.9                      memoise_1.1.0                
## [45] evaluate_0.14                 beeswarm_0.2.3               
## [47] tools_3.6.3                   lifecycle_0.2.0              
## [49] stringr_1.4.0                 munsell_0.5.0                
## [51] irlba_2.3.3                   AnnotationDbi_1.48.0         
## [53] compiler_3.6.3                rsvd_1.0.3                   
## [55] rlang_0.4.7                   grid_3.6.3                   
## [57] RCurl_1.98-1.2                BiocNeighbors_1.4.2          
## [59] rappdirs_0.3.1                labeling_0.3                 
## [61] bitops_1.0-6                  rmarkdown_2.3                
## [63] ExperimentHub_1.12.0          codetools_0.2-16             
## [65] gtable_0.3.0                  DBI_1.1.0                    
## [67] curl_4.3                      R6_2.4.1                     
## [69] gridExtra_2.3                 dplyr_1.0.2                  
## [71] fastmap_1.0.1                 bit_4.0.4                    
## [73] stringi_1.4.6                 Rcpp_1.0.5                   
## [75] vctrs_0.3.4                   dbplyr_1.4.4                 
## [77] tidyselect_1.1.0              xfun_0.16